# -*- coding: utf-8 -*-
# Author: xiantingDeng
# File: 03_网络爬虫基础.py
# Time: 17:11


# urllib.request
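# Signature: urllib.request.urlopen(url, data=None, [timeout, ]*, cafile=None, capath=None, cadefault=False, context=None)
# Note: cafile, capath and cadefault were removed in Python 3.13; pass an ssl.SSLContext via `context` instead.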

import urllib.request
import re

# Page whose outgoing links are listed when this file is run as a script.
url = "http://news.baidu.com"

# Browser-like request headers. The value must be the bare user-agent string;
# repeating the "User-Agent:" header name inside it sends a malformed header.
headers = {
    "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
}

# Captures the absolute URL (scheme://...) of each <a href="..."> attribute.
# The URL part stops at whitespace or a double quote so a single match can
# never run across several attributes or tags.
pat = r'<a href="([a-zA-Z]+://[^\s"]*)"'


def extract_links(html_text):
    """Return the absolute URLs found in ``<a href="...">`` attributes.

    Args:
        html_text: HTML source to scan, as a ``str``.

    Returns:
        A list of URL strings in document order (duplicates are kept).
        Relative links and links without a ``scheme://`` prefix are skipped.
    """
    return re.findall(pat, html_text)


def fetch_html(page_url, timeout=10):
    """Download *page_url* and return its body decoded to text.

    Args:
        page_url: Address of the page to fetch.
        timeout: Seconds to wait on the network before giving up.

    Returns:
        The response body as a ``str``.

    Raises:
        urllib.error.URLError: If the request fails or times out.
    """
    req = urllib.request.Request(page_url, headers=headers)
    # The context manager closes the connection even if read/decode fails.
    with urllib.request.urlopen(req, timeout=timeout) as res:
        # Prefer the charset declared by the server; fall back to UTF-8.
        charset = res.headers.get_content_charset() or "utf-8"
        # Undecodable bytes are replaced rather than aborting the scrape.
        return res.read().decode(charset, errors="replace")


if __name__ == "__main__":
    # findall() with a single capture group yields plain strings, so each
    # item is the complete URL and is printed as-is.
    for link in extract_links(fetch_html(url)):
        print(link)
